https://stat545.com/basic-data-care.html
Laoding all the Packages needed
library(gapminder)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
Different ways to explore a data frame, tibble
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
class(gapminder)
## [1] "tbl_df" "tbl" "data.frame"
gapminder
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
head(gapminder)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
tail(gapminder)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Zimbabwe Africa 1982 60.4 7636524 789.
## 2 Zimbabwe Africa 1987 62.4 9216418 706.
## 3 Zimbabwe Africa 1992 60.4 10704340 693.
## 4 Zimbabwe Africa 1997 46.8 11404948 792.
## 5 Zimbabwe Africa 2002 40.0 11926563 672.
## 6 Zimbabwe Africa 2007 43.5 12311143 470.
as_tibble(iris)
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # … with 140 more rows
names(gapminder)
## [1] "country" "continent" "year" "lifeExp" "pop" "gdpPercap"
summary(gapminder)
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
# Simple plot jus to visualize dataset
plot(lifeExp ~ year, gapminder)
plot(lifeExp ~ gdpPercap, gapminder)
plot(lifeExp ~ log(gdpPercap), gapminder)
head(gapminder$lifeExp)
## [1] 28.801 30.332 31.997 34.020 36.088 38.438
summary(gapminder$lifeExp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.60 48.20 60.71 59.47 70.85 82.60
hist(gapminder$lifeExp)
class(gapminder$continent)
## [1] "factor"
summary(gapminder$continent)
## Africa Americas Asia Europe Oceania
## 624 300 396 360 24
levels(gapminder$continent)
## [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
nlevels(gapminder$continent)
## [1] 5
table(gapminder$continent)
##
## Africa Americas Asia Europe Oceania
## 624 300 396 360 24
barplot(table(gapminder$continent))
It is part of the tidyverse package,
Cmd+Shift+M (Mac).- shortcut for %>%
filter(gapminder, lifeExp < 29)
## # A tibble: 2 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Rwanda Africa 1992 23.6 7290203 737.
filter(gapminder, country == "Rwanda" , year >1979)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Rwanda Africa 1982 46.2 5507565 882.
## 2 Rwanda Africa 1987 44.0 6349365 848.
## 3 Rwanda Africa 1992 23.6 7290203 737.
## 4 Rwanda Africa 1997 36.1 7212583 590.
## 5 Rwanda Africa 2002 43.4 7852401 786.
## 6 Rwanda Africa 2007 46.2 8860588 863.
filter(gapminder, country %in% c("Rwanda", "Afghanistan"))
## # A tibble: 24 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 14 more rows
gapminder %>%
select(year, lifeExp) %>%
head(4)
## # A tibble: 4 x 2
## year lifeExp
## <int> <dbl>
## 1 1952 28.8
## 2 1957 30.3
## 3 1962 32.0
## 4 1967 34.0
gapminder %>%
filter(country == "Cambodia") %>%
select(year, lifeExp)
## # A tibble: 12 x 2
## year lifeExp
## <int> <dbl>
## 1 1952 39.4
## 2 1957 41.4
## 3 1962 43.4
## 4 1967 45.4
## 5 1972 40.3
## 6 1977 31.2
## 7 1982 51.0
## 8 1987 53.9
## 9 1992 55.8
## 10 1997 56.5
## 11 2002 56.8
## 12 2007 59.7
### Creating a copy of gapminder so no changes are made to the original dataset
(my_gap <- gapminder)
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
my_gap %>%
mutate(gdp = pop * gdpPercap)
## # A tibble: 1,704 x 7
## country continent year lifeExp pop gdpPercap gdp
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779. 6567086330.
## 2 Afghanistan Asia 1957 30.3 9240934 821. 7585448670.
## 3 Afghanistan Asia 1962 32.0 10267083 853. 8758855797.
## 4 Afghanistan Asia 1967 34.0 11537966 836. 9648014150.
## 5 Afghanistan Asia 1972 36.1 13079460 740. 9678553274.
## 6 Afghanistan Asia 1977 38.4 14880372 786. 11697659231.
## 7 Afghanistan Asia 1982 39.9 12881816 978. 12598563401.
## 8 Afghanistan Asia 1987 40.8 13867957 852. 11820990309.
## 9 Afghanistan Asia 1992 41.7 16317921 649. 10595901589.
## 10 Afghanistan Asia 1997 41.8 22227415 635. 14121995875.
## # … with 1,694 more rows
Maybe it would be more meaningful to consumers of my tables and figures to stick with GDP per capita. But what if I reported GDP per capita, relative to some benchmark country. Since Canada is my adopted home, I’ll go with that.
I need to create a new variable that is gdpPercap divided by Canadian gdpPercap, taking care that I always divide two numbers that pertain to the same year.
How I achieve this:
1.Filter down to the rows for Canada. 2.Create a new temporary variable in my_gap: 3.Extract the gdpPercap variable from the Canadian data. 4.Replicate it once per country in the dataset, so it has the right length. 5.Divide raw gdpPercap by this Canadian figure. 6.Discard the temporary variable of replicated Canadian gdpPercap
summarize_at() applies the same summary function(s) to multiple variables
my_gap <- gapminder
ctib <- my_gap %>%
filter(country == "Canada")
my_gap <- my_gap %>%
mutate (tmp = rep(ctib$gdpPercap, nlevels(country)),
gdpPercapRel = gdpPercap / tmp,
tmp = NULL)
head(my_gap)
## # A tibble: 6 x 7
## country continent year lifeExp pop gdpPercap gdpPercapRel
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779. 0.0686
## 2 Afghanistan Asia 1957 30.3 9240934 821. 0.0657
## 3 Afghanistan Asia 1962 32.0 10267083 853. 0.0634
## 4 Afghanistan Asia 1967 34.0 11537966 836. 0.0520
## 5 Afghanistan Asia 1972 36.1 13079460 740. 0.0390
## 6 Afghanistan Asia 1977 38.4 14880372 786. 0.0356
summary (my_gap$gdpPercapRel)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.007236 0.061648 0.171521 0.326659 0.446564 9.534690
### Select
my_gap %>%
filter(country == "Burundi", year > 1996) %>%
select( yr = year, lifeExp, gdpPercap) %>%
select(gdpPercap, everything())
## # A tibble: 3 x 3
## gdpPercap yr lifeExp
## <dbl> <int> <dbl>
## 1 463. 1997 45.3
## 2 446. 2002 47.4
## 3 430. 2007 49.6
#How many observations do we have per continent?
my_gap %>%
group_by(continent) %>%
summarize(n = n())
## # A tibble: 5 x 2
## continent n
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
# The tally() function is a convenience function that knows to count rows. It honors groups
my_gap %>%
group_by(continent) %>%
tally()
## # A tibble: 5 x 2
## continent n
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
# The count() function is an even more convenient function that does both grouping and counting
my_gap %>%
count(continent)
## # A tibble: 5 x 2
## continent n
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
#What if we wanted to add the number of unique countries for each continent?
my_gap %>%
group_by(continent) %>%
summarize ( n = n(),
n_countries = n_distinct(country))
## # A tibble: 5 x 3
## continent n n_countries
## <fct> <int> <int>
## 1 Africa 624 52
## 2 Americas 300 25
## 3 Asia 396 33
## 4 Europe 360 30
## 5 Oceania 24 2
my_gap %>%
group_by(continent) %>%
summarize(avg_lifeExp = mean(lifeExp))
## # A tibble: 5 x 2
## continent avg_lifeExp
## <fct> <dbl>
## 1 Africa 48.9
## 2 Americas 64.7
## 3 Asia 60.1
## 4 Europe 71.9
## 5 Oceania 74.3
# Let’s compute average and median life expectancy and GDP per capita by continent by year…but only for 1952 and 2007.
my_gap %>%
filter(year %in% c(1952,2007)) %>%
group_by(continent, year) %>%
summarize_at(vars(lifeExp,gdpPercap), list(~mean(.),~median(.)))
## # A tibble: 10 x 6
## # Groups: continent [5]
## continent year lifeExp_mean gdpPercap_mean lifeExp_median
## <fct> <int> <dbl> <dbl> <dbl>
## 1 Africa 1952 39.1 1253. 38.8
## 2 Africa 2007 54.8 3089. 52.9
## 3 Americas 1952 53.3 4079. 54.7
## 4 Americas 2007 73.6 11003. 72.9
## 5 Asia 1952 46.3 5195. 44.9
## 6 Asia 2007 70.7 12473. 72.4
## 7 Europe 1952 64.4 5661. 65.9
## 8 Europe 2007 77.6 25054. 78.6
## 9 Oceania 1952 69.3 10298. 69.3
## 10 Oceania 2007 80.7 29810. 80.7
## # … with 1 more variable: gdpPercap_median <dbl>
#Let’s focus just on Asia. What are the minimum and maximum life expectancies seen by year?
my_gap %>%
filter(continent == "Asia") %>%
group_by(year) %>%
summarize ( min_lifeExp = min(lifeExp), max_lifeExp = max(lifeExp))
## # A tibble: 12 x 3
## year min_lifeExp max_lifeExp
## <int> <dbl> <dbl>
## 1 1952 28.8 65.4
## 2 1957 30.3 67.8
## 3 1962 32.0 69.4
## 4 1967 34.0 71.4
## 5 1972 36.1 73.4
## 6 1977 31.2 75.4
## 7 1982 39.9 77.1
## 8 1987 40.8 78.7
## 9 1992 41.7 79.4
## 10 1997 41.8 80.7
## 11 2002 42.1 82
## 12 2007 43.8 82.6
Let’s make a new variable that is the years of life expectancy gained (lost) relative to 1952, for each individual country. We group by country and use mutate() to make a new variable. The first() function extracts the first value from a vector. Notice that first() is operating on the vector of life expectancies within each country group.
my_gap %>%
group_by(country) %>%
select(country, year, lifeExp) %>%
mutate(lifeExp_gain = lifeExp - first(lifeExp)) %>%
filter(year < 1963)
## # A tibble: 426 x 4
## # Groups: country [142]
## country year lifeExp lifeExp_gain
## <fct> <int> <dbl> <dbl>
## 1 Afghanistan 1952 28.8 0
## 2 Afghanistan 1957 30.3 1.53
## 3 Afghanistan 1962 32.0 3.20
## 4 Albania 1952 55.2 0
## 5 Albania 1957 59.3 4.05
## 6 Albania 1962 64.8 9.59
## 7 Algeria 1952 43.1 0
## 8 Algeria 1957 45.7 2.61
## 9 Algeria 1962 48.3 5.23
## 10 Angola 1952 30.0 0
## # … with 416 more rows
my_gap %>%
filter(continent == "Asia") %>%
select(year, country, lifeExp) %>%
group_by(year) %>%
filter(min_rank(desc(lifeExp)) < 2 | min_rank(lifeExp) < 2 ) %>%
arrange(year) %>%
print(n = Inf)
## # A tibble: 24 x 3
## # Groups: year [12]
## year country lifeExp
## <int> <fct> <dbl>
## 1 1952 Afghanistan 28.8
## 2 1952 Israel 65.4
## 3 1957 Afghanistan 30.3
## 4 1957 Israel 67.8
## 5 1962 Afghanistan 32.0
## 6 1962 Israel 69.4
## 7 1967 Afghanistan 34.0
## 8 1967 Japan 71.4
## 9 1972 Afghanistan 36.1
## 10 1972 Japan 73.4
## 11 1977 Cambodia 31.2
## 12 1977 Japan 75.4
## 13 1982 Afghanistan 39.9
## 14 1982 Japan 77.1
## 15 1987 Afghanistan 40.8
## 16 1987 Japan 78.7
## 17 1992 Afghanistan 41.7
## 18 1992 Japan 79.4
## 19 1997 Afghanistan 41.8
## 20 1997 Japan 80.7
## 21 2002 Afghanistan 42.1
## 22 2002 Japan 82
## 23 2007 Afghanistan 43.8
## 24 2007 Japan 82.6
## which country experienced the sharpest 5-year drop in life expectancy
my_gap %>%
select(country, year, continent, lifeExp) %>%
group_by(continent, country) %>%
mutate( le_delta = lifeExp - lag(lifeExp)) %>%
summarize( worst_le_delta = min(le_delta, na.rm = TRUE)) %>%
top_n(-1 , wt = worst_le_delta) %>%
arrange(worst_le_delta)
## # A tibble: 5 x 3
## # Groups: continent [5]
## continent country worst_le_delta
## <fct> <fct> <dbl>
## 1 Africa Rwanda -20.4
## 2 Asia Cambodia -9.10
## 3 Americas El Salvador -1.51
## 4 Europe Montenegro -1.46
## 5 Oceania Australia 0.170
library(tidyverse)
library(fs)
(gap_tsv <- path_package("gapminder","extdata","gapminder.tsv"))
## /Users/alodia/Library/R/3.6/library/gapminder/extdata/gapminder.tsv
## Rectangular data
gapminder <- read_tsv(gap_tsv)
## Parsed with column specification:
## cols(
## country = col_character(),
## continent = col_character(),
## year = col_double(),
## lifeExp = col_double(),
## pop = col_double(),
## gdpPercap = col_double()
## )
str(gapminder, give.attr = FALSE)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ continent: chr "Asia" "Asia" "Asia" "Asia" ...
## $ year : num 1952 1957 1962 1967 1972 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ gdpPercap: num 779 821 853 836 740 ...
gapminder <- gapminder %>%
mutate(country = factor(country),
continent = factor (continent))
str(gapminder)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : num 1952 1957 1962 1967 1972 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# Let’s create a country-level summary of maximum life expectancy
gap_life_exp <- gapminder %>%
group_by(country, continent) %>%
summarise(life_exp = max(lifeExp)) %>%
ungroup()
gap_life_exp
## # A tibble: 142 x 3
## country continent life_exp
## <fct> <fct> <dbl>
## 1 Afghanistan Asia 43.8
## 2 Albania Europe 76.4
## 3 Algeria Africa 72.3
## 4 Angola Africa 42.7
## 5 Argentina Americas 75.3
## 6 Australia Oceania 81.2
## 7 Austria Europe 79.8
## 8 Bahrain Asia 75.6
## 9 Bangladesh Asia 64.1
## 10 Belgium Europe 79.4
## # … with 132 more rows
write_csv(gap_life_exp, "gap_life_exp.csv")
head(levels(gap_life_exp$country))
## [1] "Afghanistan" "Albania" "Algeria" "Angola" "Argentina"
## [6] "Australia"
gap_life_exp <- gap_life_exp %>%
mutate(country = fct_reorder(country, life_exp))
head(levels(gap_life_exp$country))
## [1] "Sierra Leone" "Angola" "Afghanistan" "Liberia"
## [5] "Rwanda" "Mozambique"
gap_life_exp$country
## [1] Afghanistan Albania
## [3] Algeria Angola
## [5] Argentina Australia
## [7] Austria Bahrain
## [9] Bangladesh Belgium
## [11] Benin Bolivia
## [13] Bosnia and Herzegovina Botswana
## [15] Brazil Bulgaria
## [17] Burkina Faso Burundi
## [19] Cambodia Cameroon
## [21] Canada Central African Republic
## [23] Chad Chile
## [25] China Colombia
## [27] Comoros Congo, Dem. Rep.
## [29] Congo, Rep. Costa Rica
## [31] Cote d'Ivoire Croatia
## [33] Cuba Czech Republic
## [35] Denmark Djibouti
## [37] Dominican Republic Ecuador
## [39] Egypt El Salvador
## [41] Equatorial Guinea Eritrea
## [43] Ethiopia Finland
## [45] France Gabon
## [47] Gambia Germany
## [49] Ghana Greece
## [51] Guatemala Guinea
## [53] Guinea-Bissau Haiti
## [55] Honduras Hong Kong, China
## [57] Hungary Iceland
## [59] India Indonesia
## [61] Iran Iraq
## [63] Ireland Israel
## [65] Italy Jamaica
## [67] Japan Jordan
## [69] Kenya Korea, Dem. Rep.
## [71] Korea, Rep. Kuwait
## [73] Lebanon Lesotho
## [75] Liberia Libya
## [77] Madagascar Malawi
## [79] Malaysia Mali
## [81] Mauritania Mauritius
## [83] Mexico Mongolia
## [85] Montenegro Morocco
## [87] Mozambique Myanmar
## [89] Namibia Nepal
## [91] Netherlands New Zealand
## [93] Nicaragua Niger
## [95] Nigeria Norway
## [97] Oman Pakistan
## [99] Panama Paraguay
## [101] Peru Philippines
## [103] Poland Portugal
## [105] Puerto Rico Reunion
## [107] Romania Rwanda
## [109] Sao Tome and Principe Saudi Arabia
## [111] Senegal Serbia
## [113] Sierra Leone Singapore
## [115] Slovak Republic Slovenia
## [117] Somalia South Africa
## [119] Spain Sri Lanka
## [121] Sudan Swaziland
## [123] Sweden Switzerland
## [125] Syria Taiwan
## [127] Tanzania Thailand
## [129] Togo Trinidad and Tobago
## [131] Tunisia Turkey
## [133] Uganda United Kingdom
## [135] United States Uruguay
## [137] Venezuela Vietnam
## [139] West Bank and Gaza Yemen, Rep.
## [141] Zambia Zimbabwe
## 142 Levels: Sierra Leone Angola Afghanistan Liberia Rwanda ... Japan
### Cleaning up the files that we have created
file.remove(list.files(pattern = "^gap_life_exp"))
## [1] TRUE
library(tidyverse)
library(gapminder)
str(gapminder$continent)
## Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
nlevels(gapminder$continent)
## [1] 5
class(gapminder$continent)
## [1] "factor"
gapminder %>%
count(continent)
## # A tibble: 5 x 2
## continent n
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
fct_count(gapminder$continent)
## # A tibble: 5 x 2
## f n
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
### Make sure to drop the factors
nlevels(gapminder$country)
## [1] 142
h_countries <- c("Egypt", "Haiti", "Romania", "Thailand", "Venezuela")
h_gap <- gapminder %>%
filter(country %in% h_countries)
nlevels(h_gap$country)
## [1] 142
## where we actually drop the levels of the factor
h_gap_dropped <- h_gap %>%
droplevels()
nlevels(h_gap_dropped$country )
## [1] 5
### use forcats::fct_drop() on a free- range factor
h_gap$country %>%
fct_drop() %>%
levels()
## [1] "Egypt" "Haiti" "Romania" "Thailand" "Venezuela"
### Filter the gapminder data down to rows where population is less than a quarter of a million.Get rid of the unused factor levels for country and continent
fct_count(gapminder$country)
## # A tibble: 142 x 2
## f n
## <fct> <int>
## 1 Afghanistan 12
## 2 Albania 12
## 3 Algeria 12
## 4 Angola 12
## 5 Argentina 12
## 6 Australia 12
## 7 Austria 12
## 8 Bahrain 12
## 9 Bangladesh 12
## 10 Belgium 12
## # … with 132 more rows
fct_count(gapminder$continent)
## # A tibble: 5 x 2
## f n
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
names(gapminder)
## [1] "country" "continent" "year" "lifeExp" "pop" "gdpPercap"
test_gap <- gapminder %>%
filter( pop < 250000)
fct_count(test_gap$continent)
## # A tibble: 5 x 2
## f n
## <fct> <int>
## 1 Africa 26
## 2 Americas 0
## 3 Asia 7
## 4 Europe 8
## 5 Oceania 0
## should be 3 continents
test_gap$country %>%
fct_drop() %>%
levels()
## [1] "Bahrain" "Comoros" "Djibouti"
## [4] "Equatorial Guinea" "Iceland" "Kuwait"
## [7] "Sao Tome and Principe"
test_gap$continent %>%
fct_drop() %>%
levels()
## [1] "Africa" "Asia" "Europe"
### now trying to save it after getting it done
test_gap_dropped <- test_gap %>%
droplevels()
nlevels(test_gap_dropped$continent)
## [1] 3
nlevels(test_gap_dropped$country)
## [1] 7
### orderinf factor levels by frequency or by another variable
#default
gapminder$continent %>%
levels()
## [1] "Africa" "Americas" "Asia" "Europe" "Oceania"
## ordering by frequency
gapminder$continent %>%
fct_infreq() %>%
levels()
## [1] "Africa" "Asia" "Europe" "Americas" "Oceania"
## backwards- least
gapminder$continent %>%
fct_infreq() %>%
fct_rev %>%
levels()
## [1] "Oceania" "Americas" "Europe" "Asia" "Africa"
## order countries by median life expectancy
fct_reorder(gapminder$country, gapminder$lifeExp) %>%
levels() %>%
head()
## [1] "Sierra Leone" "Guinea-Bissau" "Afghanistan" "Angola"
## [5] "Somalia" "Guinea"
## order accoring to minimum life exp instead of median
fct_reorder(gapminder$country, gapminder$lifeExp, min) %>%
levels() %>%
head()
## [1] "Rwanda" "Afghanistan" "Gambia" "Angola"
## [5] "Sierra Leone" "Cambodia"
## backwards (highest life exp)
fct_reorder(gapminder$country, gapminder$lifeExp, .desc = TRUE) %>%
levels() %>%
head()
## [1] "Iceland" "Japan" "Sweden" "Switzerland" "Netherlands"
## [6] "Norway"
###### Example of why we reorder factor levels: often makes plots much better! When a factor is mapped to x or y, it should almost always be reordered by the quantitative variable you are mapping to the other one
gap_asia_2007 <- gapminder %>%
filter(year == 2007 , continent == "Asia")
ggplot(gap_asia_2007,aes(x = lifeExp, y = country)) + geom_point()
ggplot(gap_asia_2007, aes(x = lifeExp, y = fct_reorder(country, lifeExp))) + geom_point()
#Use fct_reorder2() when you have a line chart of a quantitative x against another quantitative y and your factor provides the color. This way the legend appears in some order as the data!
h_countries <- c("Egypt", "Haiti", "Romania", "Thailand", "Venezuela")
h_gap <- gapminder %>%
filter(country %in% h_countries) %>%
droplevels()
ggplot(h_gap, aes(x= year, y = lifeExp, color = country)) + geom_line()
ggplot(h_gap, aes(x= year, y = lifeExp, color = fct_reorder2(country, year, lifeExp))) + geom_line() + labs(color = "country")
### Stopped at 10.7 too tired to continue ....hmmm..hmmm
## New day Jan 22-2020 - Growing a factor
df1 <- gapminder %>%
filter(country %in% c("United States", "Mexico"), year > 2000) %>%
droplevels()
df2 <- gapminder %>%
filter(country %in% c("France","Germany"), year > 2000) %>%
droplevels()
levels(df1$country)
## [1] "Mexico" "United States"
levels (df2$country)
## [1] "France" "Germany"
## combining using fct_c
fct_c(df1$country, df2$country)
## [1] Mexico Mexico United States United States France
## [6] France Germany Germany
## Levels: Mexico United States France Germany
## Chapter 11 - Character vector
Stringr package - manipulating character vectors
Especially useful for functions that split one character vector into many and vice versa: separate(), unite(), extract(). Base functions: nchar(), strsplit(), substr(), paste(), paste0(). The glue package is fantastic for string interpolation. If stringr::str_interp() doesn’t get your job done, check out the glue package.
## Detect or filter on a target string - str_
str_detect(fruit, pattern = "fruit")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [78] FALSE TRUE FALSE
fruit
## [1] "apple" "apricot" "avocado"
## [4] "banana" "bell pepper" "bilberry"
## [7] "blackberry" "blackcurrant" "blood orange"
## [10] "blueberry" "boysenberry" "breadfruit"
## [13] "canary melon" "cantaloupe" "cherimoya"
## [16] "cherry" "chili pepper" "clementine"
## [19] "cloudberry" "coconut" "cranberry"
## [22] "cucumber" "currant" "damson"
## [25] "date" "dragonfruit" "durian"
## [28] "eggplant" "elderberry" "feijoa"
## [31] "fig" "goji berry" "gooseberry"
## [34] "grape" "grapefruit" "guava"
## [37] "honeydew" "huckleberry" "jackfruit"
## [40] "jambul" "jujube" "kiwi fruit"
## [43] "kumquat" "lemon" "lime"
## [46] "loquat" "lychee" "mandarine"
## [49] "mango" "mulberry" "nectarine"
## [52] "nut" "olive" "orange"
## [55] "pamelo" "papaya" "passionfruit"
## [58] "peach" "pear" "persimmon"
## [61] "physalis" "pineapple" "plum"
## [64] "pomegranate" "pomelo" "purple mangosteen"
## [67] "quince" "raisin" "rambutan"
## [70] "raspberry" "redcurrant" "rock melon"
## [73] "salal berry" "satsuma" "star fruit"
## [76] "strawberry" "tamarillo" "tangerine"
## [79] "ugli fruit" "watermelon"
## Only keep the actual fruits that match
my_fruit <- str_subset(fruit, pattern = "fruit")
my_fruit
## [1] "breadfruit" "dragonfruit" "grapefruit" "jackfruit"
## [5] "kiwi fruit" "passionfruit" "star fruit" "ugli fruit"
## substring extraction (and replacement by position)
length (my_fruit)
## [1] 8
str_length(my_fruit)
## [1] 10 11 10 9 10 12 10 10
## first 3 letters
head(fruit) %>%
str_sub(1,3)
## [1] "app" "apr" "avo" "ban" "bel" "bil"
### The regex a.b will match all countries that have an a, followed by any single character, followed by b. Yes, regexes are case sensitive, i.e. “Italy” does not match.
countries <- levels(gapminder$country)
str_subset(countries, pattern = "i.a")
## [1] "Argentina" "Bosnia and Herzegovina"
## [3] "Burkina Faso" "Central African Republic"
## [5] "China" "Costa Rica"
## [7] "Dominican Republic" "Hong Kong, China"
## [9] "Jamaica" "Mauritania"
## [11] "Nicaragua" "South Africa"
## [13] "Swaziland" "Taiwan"
## [15] "Thailand" "Trinidad and Tobago"
## Anchors can be included to express where the expression must occur within the string. The ^ indicates the beginning of string and $ indicates the end.
str_subset(countries, pattern = "i.a$")
## [1] "Argentina" "Bosnia and Herzegovina"
## [3] "China" "Costa Rica"
## [5] "Hong Kong, China" "Jamaica"
## [7] "South Africa"
str_subset(my_fruit, pattern = "d")
## [1] "breadfruit" "dragonfruit"
str_subset(my_fruit, pattern = "^d")
## [1] "dragonfruit"
## 13 - Dates and Times
library(tidyverse)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
Sys.Date()
## [1] "2020-01-23"
today()
## [1] "2020-01-23"
str(Sys.Date())
## Date[1:1], format: "2020-01-23"
class(Sys.Date())
## [1] "Date"
str(today())
## Date[1:1], format: "2020-01-23"
class(today())
## [1] "Date"
Sys.time()
## [1] "2020-01-23 13:34:43 CST"
now()
## [1] "2020-01-23 13:34:43 CST"
str(Sys.time())
## POSIXct[1:1], format: "2020-01-23 13:34:43"
class(Sys.time())
## [1] "POSIXct" "POSIXt"
str(now())
## POSIXct[1:1], format: "2020-01-23 13:34:43"
class(now())
## [1] "POSIXct" "POSIXt"
### Column binding
library(gapminder)
life_exp <- gapminder %>%
select(country, year,lifeExp)
pop <- gapminder %>%
arrange(year) %>%
select(pop)
gdp_percap <- gapminder %>%
arrange(pop) %>%
select(gdpPercap)
(gapminder_garbage <- bind_cols(life_exp, pop,gdp_percap))
## # A tibble: 1,704 x 5
## country year lifeExp pop gdpPercap
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 1952 28.8 8425333 880.
## 2 Afghanistan 1957 30.3 1282697 861.
## 3 Afghanistan 1962 32.0 9279525 2670.
## 4 Afghanistan 1967 34.0 4232095 1072.
## 5 Afghanistan 1972 36.1 17876956 1385.
## 6 Afghanistan 1977 38.4 8691212 2865.
## 7 Afghanistan 1982 39.9 6927772 1533.
## 8 Afghanistan 1987 40.8 120447 1738.
## 9 Afghanistan 1992 41.7 46886859 3021.
## 10 Afghanistan 1997 41.8 8730405 1890.
## # … with 1,694 more rows
summary(gapminder$lifeExp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.60 48.20 60.71 59.47 70.85 82.60
summary(gapminder_garbage$lifeExp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.60 48.20 60.71 59.47 70.85 82.60
range(gapminder$gdpPercap)
## [1] 241.1659 113523.1329
range(gapminder_garbage$gdpPercap)
## [1] 241.1659 113523.1329
#### 18 - Writing your own functions, part 1
Build that skateboard before you build the car or some fancy car part. A limited-but-functioning thing is very useful. It also keeps the spirits high
The special argument … is very useful when you want the ability to pass arbitrary arguments down to another function, but without constantly expanding the formal arguments to your function. This leaves you with a less cluttered function definition and gives you future flexibility to specify these arguments only when you need to.
Until now, we’ve relied on informal tests of our evolving function. If you are going to use a function a lot, especially if it is part of a package, it is wise to use formal unit tests.
The testthat package (CRAN; GitHub) provides excellent facilities for this, with a distinct emphasis on automated unit testing of entire packages. However, we can take it out for a test drive even with our one measly function.
We will construct a test with test_that()
library(gapminder)
str(gapminder)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : num 1952 1957 1962 1967 1972 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ gdpPercap: num 779 821 853 836 740 ...
min(gapminder$lifeExp)
## [1] 23.599
max(gapminder$lifeExp)
## [1] 82.603
range(gapminder$lifeExp)
## [1] 23.599 82.603
### first function
max_minus_min <- function(x) max(x) - min(x)
max_minus_min(gapminder$lifeExp)
## [1] 59.004
max_minus_min(1:10)
## [1] 9
max_minus_min(runif(1000))
## [1] 0.9917401
### stop if not
mmm <- function(x) {
stopifnot(is.numeric(x))
max(x) - min(x)
}
mmm(gapminder)
## Error in mmm(gapminder): is.numeric(x) is not TRUE
mmm2 <- function(x){
if(!is.numeric(x)) {
stop('I am so sorry, but this function only works for numeric input!\n',
'You have provided an object of class: ', class(x)[1])
}
max(x)- min(x)
}
mmm2(gapminder)
## Error in mmm2(gapminder): I am so sorry, but this function only works for numeric input!
## You have provided an object of class: spec_tbl_df
quantile(gapminder$lifeExp)
## 0% 25% 50% 75% 100%
## 23.5990 48.1980 60.7125 70.8455 82.6030
quantile(gapminder$lifeExp, probs = .50)
## 50%
## 60.7125
median(gapminder$lifeExp)
## [1] 60.7125
quantile(gapminder$lifeExp, probs = c(.25,.75))
## 25% 75%
## 48.1980 70.8455
boxplot(gapminder$lifeExp, plot = FALSE)$stats
## [,1]
## [1,] 23.5990
## [2,] 48.1850
## [3,] 60.7125
## [4,] 70.8460
## [5,] 82.6030
### Now write a code snippet that takes the difference between two quantiles.
the_probs <- c(.25,.75)
the_quantiles <- quantile(gapminder$lifeExp, probs = the_probs )
max(the_quantiles) - min(the_quantiles)
## [1] 22.6475
qdiff1 <- function(x,probs) {
stopifnot(is.numeric(x))
the_quantiles <- quantile( x = x , probs = probs)
max(the_quantiles) - min(the_quantiles)
}
qdiff1(gapminder$lifeExp, probs = c(.25,.75))
## [1] 22.6475
IQR(gapminder$lifeExp)
## [1] 22.6475
qdiff3 <- function(x, probs = c(0, 1)) {
stopifnot(is.numeric(x))
the_quantiles <- quantile(x, probs)
max(the_quantiles) - min(the_quantiles)
}
qdiff3(gapminder$lifeExp)
## [1] 59.004
z <- gapminder$lifeExp
z[3] <- NA
quantile(gapminder$lifeExp)
## 0% 25% 50% 75% 100%
## 23.5990 48.1980 60.7125 70.8455 82.6030
quantile(z)
## Error in quantile.default(z): missing values and NaN's not allowed if 'na.rm' is FALSE
quantile(z, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 23.599 48.228 60.765 70.846 82.603
### Dealing with NA'S within our quantile function
qdiff4 <- function(x, probs = c(0, 1)) {
stopifnot(is.numeric(x))
the_quantiles <- quantile(x, probs, na.rm = TRUE)
max(the_quantiles) - min(the_quantiles)
}
qdiff4(gapminder$lifeExp)
## [1] 59.004
qdiff4(z)
## [1] 59.004
### Dont give defaults, on this function we set na.rm = na.rm
qdiff5 <- function( x, probs = c(0,1), na.rm = TRUE) {
stopifnot(is.numeric(x))
the_quantiles <- quantile(x, probs, na.rm = na.rm)
max(the_quantiles) - min(the_quantiles)
}
qdiff5(gapminder$lifeExp)
## [1] 59.004
qdiff5(z)
## [1] 59.004
qdiff5(z, na.rm = FALSE)
## Error in quantile.default(x, probs, na.rm = na.rm): missing values and NaN's not allowed if 'na.rm' is FALSE
#### we can use ellipsis in function to give the end user freedom to add to the function
qdiff6 <- function( x, probs = c(0,1), na.rm = TRUE, ...) {
stopifnot(is.numeric(x))
the_quantiles <- quantile(x, probs, na.rm = na.rm )
max(the_quantiles) - min(the_quantiles)
}
### showing difference between type 1 and type 2
set.seed(1234)
z <- rnorm(10)
quantile(z, type = 1)
## 0% 25% 50% 75% 100%
## -2.3456977 -0.8900378 -0.5644520 0.4291247 1.0844412
quantile(z, type = 3)
## 0% 25% 50% 75% 100%
## -2.3456977 -1.2070657 -0.5644520 0.4291247 1.0844412
all.equal(quantile(z, type = 1), quantile(z,type = 3))
## [1] "Mean relative difference: 0.356196"
qdiff6(z, probs = c(0.25,.075), type = 1)
## [1] 0.7659078
qdiff6(z, probs = c(0.25,.075), type = 3)
## [1] 0.7659078
#### Let’s revert to a version of our function that does no NA handling, then test for proper NA handling. We can watch it fail.
library(testthat)
##
## Attaching package: 'testthat'
## The following object is masked from 'package:dplyr':
##
## matches
## The following object is masked from 'package:purrr':
##
## is_null
## The following object is masked from 'package:tidyr':
##
## matches
qdiff_no_NA_no <- function(x, probs = c(0, 1)) {
the_quantiles <- quantile(x = x, probs = probs)
max(the_quantiles) - min(the_quantiles)
}
### doesnt work and test that detects it correctly
test_that('Na handling works' , {
expect_that(qdiff_no_NA_no(c(1:5,NA)), equals(4))
})
## Error: Test failed: 'Na handling works'
## * missing values and NaN's not allowed if 'na.rm' is FALSE
## 1: expect_that(qdiff_no_NA_no(c(1:5, NA)), equals(4)) at <text>:144
## 2: condition(object)
## 3: expect_equal(x, expected, ..., expected.label = label)
## 4: quasi_label(enquo(object), label, arg = "object")
## 5: eval_bare(get_expr(quo), get_env(quo))
## 6: qdiff_no_NA_no(c(1:5, NA))
## 7: quantile(x = x, probs = probs) at <text>:138
## 8: quantile.default(x = x, probs = probs)
## 9: stop("missing values and NaN's not allowed if 'na.rm' is FALSE")
##works
qdiff_no_NA <- function(x, probs = c(0, 1), na.rm = TRUE) {
the_quantiles <- quantile(x = x, probs = probs, na.rm = na.rm)
max(the_quantiles) - min(the_quantiles)
}
test_that('Na handling works' , {
expect_that(qdiff_no_NA(c(1:5,NA)), equals(4))
})
library(gapminder)
library(ggplot2)
library(dplyr)
## getting data to practice
j_country <- "France"
(j_dat <- gapminder %>%
filter(country == j_country))
## # A tibble: 12 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 France Europe 1952 67.4 42459667 7030.
## 2 France Europe 1957 68.9 44310863 8663.
## 3 France Europe 1962 70.5 47124000 10560.
## 4 France Europe 1967 71.6 49569000 13000.
## 5 France Europe 1972 72.4 51732000 16107.
## 6 France Europe 1977 73.8 53165019 18293.
## 7 France Europe 1982 74.9 54433565 20294.
## 8 France Europe 1987 76.3 55630100 22066.
## 9 France Europe 1992 77.5 57374179 24704.
## 10 France Europe 1997 78.6 58623428 25890.
## 11 France Europe 2002 79.6 59925035 28926.
## 12 France Europe 2007 80.7 61083916 30470.
## Always always always plot the data
p <- ggplot(j_dat, aes(x = year, y = lifeExp))
p + geom_point() + geom_smooth(method = "lm", se = FALSE)
##Fit the regression - sanity check - change the intercept
j_fit <- lm(lifeExp ~ year , j_dat)
coef(j_fit)
## (Intercept) year
## -397.7646019 0.2385014
## coef didnt make sense because it says at year 0 life expectancy was at -397.76 so we will move it to start that the beginning of the data 1952....I() function which “inhibits interpretation/conversion of objects”
j_fit <- lm(lifeExp ~ I(year - 1952), j_dat)
coef(j_fit)
## (Intercept) I(year - 1952)
## 67.7901282 0.2385014
### Coefficient of 67.79 makes much more sense
### turn working code into a function
le_lin_fit <- function ( dat, offset = 1952) {
the_fit <- lm(lifeExp ~ I(year - offset), dat)
coef(the_fit)
}
le_lin_fit(j_dat)
## (Intercept) I(year - offset)
## 67.7901282 0.2385014
## its messy and we need to fix the names on it
le_lin_fit <- function ( dat, offset = 1952) {
the_fit <- lm(lifeExp ~ I(year - offset), dat)
setNames(coef(the_fit), c("intercept", "slope"))
}
le_lin_fit(j_dat)
## intercept slope
## 67.7901282 0.2385014
### testing the function on another country
j_country <- "Zimbabwe"
(j_dat <- gapminder %>%
filter( country == j_country))
## # A tibble: 12 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Zimbabwe Africa 1952 48.5 3080907 407.
## 2 Zimbabwe Africa 1957 50.5 3646340 519.
## 3 Zimbabwe Africa 1962 52.4 4277736 527.
## 4 Zimbabwe Africa 1967 54.0 4995432 570.
## 5 Zimbabwe Africa 1972 55.6 5861135 799.
## 6 Zimbabwe Africa 1977 57.7 6642107 686.
## 7 Zimbabwe Africa 1982 60.4 7636524 789.
## 8 Zimbabwe Africa 1987 62.4 9216418 706.
## 9 Zimbabwe Africa 1992 60.4 10704340 693.
## 10 Zimbabwe Africa 1997 46.8 11404948 792.
## 11 Zimbabwe Africa 2002 40.0 11926563 672.
## 12 Zimbabwe Africa 2007 43.5 12311143 470.
### visualizing
p <- ggplot(j_dat, aes(x = year, y = lifeExp)) + geom_point()
p + geom_smooth(method = "lm", se = FALSE)
## coefficients
j_fit <- lm(lifeExp ~ year , j_dat)
coefficients(j_fit)
## (Intercept) year
## 236.79819464 -0.09302098
### create a function
le_lin_fit(j_dat)
## intercept slope
## 55.22124359 -0.09302098
### It’s also a good idea to clean out the workspace, rerun the minimum amount of code, and retest your function. This will help you catch another common mistake: accidentally relying on objects that were lying around in the workspace during development but that are not actually defined in your function nor passed as formal arguments.
rm(list = ls())
le_lin_fit <- function(dat, offset = 1952) {
the_fit <- lm(lifeExp ~ I(year - offset), dat)
setNames(coef(the_fit), c("Intercept", "Slope"))
}
le_lin_fit(gapminder %>% filter(country == "Zimbabwe"))
## Intercept Slope
## 55.22124359 -0.09302098
le_lin_fit(gapminder %>% filter(country == "Kenya"))
## Intercept Slope
## 47.0020385 0.2065077
### GGplot tutorial - https://github.com/jennybc/ggplot2-tutorial/blob/master/gapminder-ggplot2-scatterplot.md
library(tibble)
library(ggplot2)
library(gapminder)
gapminder
## # A tibble: 1,704 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
### empty plot - skeletons
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp))
p <- ggplot(gapminder, aes(x = gdpPercap, y = lifeExp))
### add points
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) + geom_point()
p + geom_point()
## Log transformation
ggplot(gapminder, aes(x = log10(gdpPercap), y = lifeExp)) + geom_point()
###### but better way to use transformations #####
p + geom_point() + scale_x_log10()
p <- p + geom_point() + scale_x_log10()
## Map Continent as a variable to aesthetic
p+ geom_point(aes(color = continent))
### Let's add a summary
plot(gapminder, aes( x = gdpPercap, y = lifeExp , color = continent)) + geom_point() + scale_x_log10()
## NULL
### addressing overplotting: Set Alpha transparancy and size to a particular value
p+ geom_point()
p + geom_point(alpha = (1/3), size = 3)
## Add a fitted curve or line
p + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
p + geom_point()+ geom_smooth( lwd = 3 , se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
p + geom_point()+ geom_smooth( lwd = 3 , se = FALSE, method = "lm")
### Let's bring back our continents
p + aes(color = continent) + geom_point() + geom_smooth( lwd = 3 , se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#### Facetting is just another way to exploit a factor
p + geom_point(alpha = (1/8) , size = 3) + facet_wrap (~ continent)
p + geom_point(alpha = (1/8) , size = 3) + facet_wrap (~ continent) + geom_smooth( lwd = 2, se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#### Let's plot lifeExp against Year
ggplot(gapminder, aes( x= year, y = lifeExp , color = continent)) + geom_jitter( alpha = 1/3 , size = 3)
### Making mini plots, split out by continent
ggplot(gapminder, aes(x = year, y = lifeExp, color = continent)) + facet_wrap(~continent, scales = "free_x") + geom_jitter(alpha = 1/3, size = 3) + scale_color_manual(values = continent_colors)
ggplot(subset(gapminder, continent != "Oceania"),
aes(x = year, y = lifeExp , group = country , color =country)) +
geom_line(lwd = 1 , show_guide = FALSE ) + facet_wrap(~continent) +
scale_color_manual(values = country_colors) +
theme_bw() + theme(strip.text = element_text (size = rel(1.1)))
## Warning: `show_guide` has been deprecated. Please use `show.legend`
## instead.
### Subsetting data
ggplot(subset (gapminder, country == "Zimbabwe"), aes(x = year, y = lifeExp)) + geom_line() + geom_point()
##can also get the same result with dplyr::filter
suppressPackageStartupMessages(library(dplyr))
ggplot(gapminder %>% filter(country == "Zimbabwe"),
aes(x = year, y = lifeExp)) + geom_line() + geom_point()
jCountries <- c("Canada", "Rwanda", "Cambodia", "Mexico")
ggplot(subset(gapminder, country %in% jCountries),
aes(x = year, y = lifeExp, color = country)) + geom_line() + geom_point()
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
scale_x_log10() + geom_bin2d()
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] testthat_2.2.1 lubridate_1.7.4 fs_1.3.1 forcats_0.4.0
## [5] stringr_1.4.0 dplyr_0.8.3 purrr_0.3.3 readr_1.3.1
## [9] tidyr_1.0.0 tibble_2.1.3 ggplot2_3.2.1 tidyverse_1.3.0
## [13] gapminder_0.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.10 splines_3.6.1 haven_2.2.0
## [5] lattice_0.20-38 colorspace_1.4-1 vctrs_0.2.0 generics_0.0.2
## [9] htmltools_0.4.0 mgcv_1.8-28 yaml_2.2.0 utf8_1.1.4
## [13] rlang_0.4.1 pillar_1.4.2 glue_1.3.1 withr_2.1.2
## [17] DBI_1.0.0 dbplyr_1.4.2 modelr_0.1.5 readxl_1.3.1
## [21] lifecycle_0.1.0 munsell_0.5.0 gtable_0.3.0 cellranger_1.1.0
## [25] rvest_0.3.5 evaluate_0.14 labeling_0.3 knitr_1.25
## [29] fansi_0.4.0 broom_0.5.2 Rcpp_1.0.2 scales_1.0.0
## [33] backports_1.1.5 jsonlite_1.6 hms_0.5.2 digest_0.6.22
## [37] stringi_1.4.3 grid_3.6.1 cli_1.1.0 tools_3.6.1
## [41] magrittr_1.5 lazyeval_0.2.2 crayon_1.3.4 pkgconfig_2.0.3
## [45] zeallot_0.1.0 Matrix_1.2-17 ellipsis_0.3.0 xml2_1.2.2
## [49] reprex_0.3.0 assertthat_0.2.1 rmarkdown_1.16 httr_1.4.1
## [53] rstudioapi_0.10 R6_2.4.0 nlme_3.1-140 compiler_3.6.1